import pandas as pd
import warnings
warnings.filterwarnings("ignore")
import time
import numpy as np
import requests
from bs4 import BeautifulSoup
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import ipywidgets as widgets
import unicodedata
import re
import seaborn as sns
import inspect
import matplotlib.pyplot as plt
from selenium import webdriver
from selenium.webdriver.edge.service import Service as EdgeService
from webdriver_manager.microsoft import EdgeChromiumDriverManager
from selenium.webdriver import ActionChains
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
The dataset covers various types of electric vehicles manufactured by several automobile companies, showing purchasing trends from 2000 to 2022.
# Scrape the Atlas EV Hub page for links to the state-level EV registration CSVs.
url = 'https://www.atlasevhub.com/materials/state-ev-registration-data/#data'
reqs = requests.get(url)
soup = BeautifulSoup(reqs.text, 'html.parser')
# NOTE: <a> tags without an href yield None from .get(); the original crashed
# with AttributeError on those, so filter them out before splitting.
links = [link.get('href') for link in soup.find_all('a')]
data = [i for i in links if i is not None and len(i.split('/')) > 1 and i.split('/')[1] == 'public']
data = [i for i in data if i.split('.')[-1] == 'csv']
# State abbreviation is the first '_'-separated token of the file name,
# e.g. '/public/uploads/mi_....csv' -> 'MI'.
state_name = [i.split('/')[3].split('_')[0].upper() for i in data]
dt = []
for i in data:
    # Download each state's raw registration records into its own DataFrame.
    dt.append(pd.read_csv('https://www.atlasevhub.com/' + i))
The data is binned annually from 2000-2022. The earliest recorded year among all datasets is 2003 and the latest is 2022. When organising the data into a dataframe, the cumulative EV sales for any year without recorded data are assigned the previous year's cumulative total (with the year 2000 initialised to 0).
def padding(df, n, states=None):
    """Pad a per-state cumulative-count frame so it covers every year 2000-2022.

    A seed row (2000, 0, state) is appended, every missing year is filled with
    the previous year's cumulative count, then rows are sorted by year and
    re-indexed 0..N-1.  Mutates and returns *df*.

    Parameters
    ----------
    df : DataFrame with columns ['Date', 'Count', 'State'].
    n : int, index of the state within *states*.
    states : optional list of state abbreviations; defaults to the module-level
        ``state_name`` list scraped earlier (kept for backward compatibility).
    """
    names = state_name if states is None else states
    # BUG FIX: the original appended at index ``dtotal.shape[0]`` (a global),
    # not ``df.shape[0]`` — it only worked by accident because the index is
    # rebuilt below and the global label never collided with df's range index.
    df.loc[df.shape[0]] = [2000, 0, names[n]]
    for year in range(2000, 2023):
        if year not in df['Date'].tolist():
            # Carry the previous year's cumulative count forward; years are
            # processed in ascending order so year-1 is always present.
            prev = df.loc[df['Date'] == year - 1]['Count'].values[0]
            df.loc[df.shape[0]] = [year, prev, names[n]]
    df = df.sort_values(by=['Date'])
    df = df.set_index(pd.Series(range(df.shape[0])))
    return df
def _state_cumulative(frame, idx):
    """Yearly cumulative EV registration counts for one state's raw frame.

    Returns a DataFrame with columns ['Date' (year), 'Count' (running total),
    'State'].  Padding to the full 2000-2022 range is done by the caller.
    """
    out = frame[['Registration Valid Date']].copy()
    out['Registration Valid Date'] = pd.to_datetime(out['Registration Valid Date'])
    out.rename({'Registration Valid Date': 'Date'}, axis=1, inplace=True)
    # Count registrations per calendar year (the original re-converted the
    # already-datetime column here — redundant).
    out = out.groupby(out['Date'].dt.year).count()
    out.rename({'Date': 'Count'}, axis=1, inplace=True)
    out['State'] = state_name[idx]
    # Turn per-year counts into a running (cumulative) total.
    out['Count'] = out['Count'].cumsum()
    return out.rename_axis('Date').reset_index()

# The original duplicated this whole pipeline once for dt[0] and again inside
# the loop; one helper now handles every state.  dtotal is assigned before the
# first padding() call, preserving the original execution order.
dtotal = _state_cumulative(dt[0], 0)
dtotal = padding(dtotal, 0)
for i in range(1, len(dt)):
    dtem = padding(_state_cumulative(dt[i], i), i)
    dtotal = pd.concat([dtotal, dtem], ignore_index=True)
dtotal['Date'] = dtotal['Date'].astype(int)
# Export JSON file containing cumulative state-wise EV sales from 2000-2022
dtotal.to_json('dtotal.json', orient='records')
# Export one JSON file per state listing (Date, Make) for every registration.
# Column naming differs between source files: some carry 'Vehicle Name',
# others 'Make'; both are reduced to the manufacturer (first word).
for i in range(len(state_name)):
    if 'Vehicle Name' in dt[i].columns:
        ds = dt[i][['Registration Valid Date', 'Vehicle Name']].copy()
        ds['Vehicle Name'] = [v.split()[0] for v in ds['Vehicle Name']]
        ds.rename(columns={'Registration Valid Date': 'Date', 'Vehicle Name': 'Make'}, inplace=True)
    elif 'Make' in dt[i].columns:
        ds = dt[i][['Registration Valid Date', 'Make']].copy()
        ds['Make'] = [v.split()[0] for v in ds['Make']]
        # (the original also "renamed" 'Make' to 'Make' — a no-op, dropped)
        ds.rename(columns={'Registration Valid Date': 'Date'}, inplace=True)
    else:
        # BUG FIX: the original fell through here with ``ds`` still holding the
        # previous state's frame and re-exported it under this state's name.
        continue
    ds.to_json(state_name[i] + '_data.json', orient='records')
# Reload the cumulative counts and animate a per-year USA choropleth.
dtotal = pd.read_json('dtotal.json', orient='records')
fig = px.choropleth(
    dtotal,
    locations='State',
    locationmode="USA-states",
    scope="usa",
    color='Count',
    color_continuous_scale="Viridis_r",
    animation_frame='Date',
)
# Single layout pass: title styling, fixed canvas size, and the animation
# slider's "Year=" prefix (merged from the original's two update_layout calls).
fig.update_layout(
    title_text="Cumulative Electric Vehicle sales in the United States",
    title_xanchor="center",
    title_font=dict(size=20),
    title_x=0.5,
    title_y=0.95,
    geo=dict(scope='usa'),
    width=700,
    height=500,
    margin=dict(t=100, b=0, r=100, l=0),
    sliders=[{"currentvalue": {"prefix": "Year="}}],
)
fig.show()
# Stitch every state's (Date, Make) export into one frame ``n``: Date reduced
# to the calendar year and a State column added.  (The original special-cased
# state_name[0] with a duplicated copy of the loop body.)
_frames = []
for _st in state_name:
    t = pd.read_json(_st + '_data.json', orient='records')
    t['Date'] = t['Date'].dt.year
    t['State'] = _st  # scalar assignment broadcasts; the list repeat was redundant
    _frames.append(t)
n = pd.concat(_frames)
# Interactive selectors for the state/year histogram below.
dropdown_s = widgets.Dropdown(
    options=state_name,
    value='MI',
    description='State:',
    disabled=False,
)
dropdown_y = widgets.Dropdown(
    options=list(range(2000, 2023)),
    value=2020,
    description='Year:',
    disabled=False,
)
def plot_func(s, y):
    """Show a histogram of EV makes registered in state *s* during year *y*.

    Reads the module-level combined frame ``n`` built above.
    """
    nsy = n[(n['State'] == s) & (n['Date'] == y)]
    # BUG FIX: the original created a throwaway go.Figure() that was
    # immediately overwritten by this px.histogram call.
    fig = px.histogram(nsy, x='Make',
                       title='EV sales in ' + s + ' in ' + str(y)
                       ).update_xaxes(categoryorder='total descending')
    fig.update_layout(width=700, height=500, margin=dict(t=30, b=0, r=0, l=0))
    fig.show()
widgets.interact(plot_func, s=dropdown_s, y=dropdown_y);
Data downloaded on 10 August 2022
# Drive Edge through afdc.energy.gov's download form to request the full
# alternative-fuel-station CSV.
driver = webdriver.Edge(service=EdgeService(EdgeChromiumDriverManager().install()))
driver.get('https://afdc.energy.gov/data_download')
# (element id, visible option text) pairs, applied in the original page order.
for _elem_id, _choice in (
    ('download_data_api', 'Alternative fuel stations'),
    ('download_data_format', 'CSV (opens in Excel)'),
    ('download_data_api_options_alt_fuel_stations_fuel_type', 'All'),
    ('download_data_api_options_alt_fuel_stations_access', 'All'),
    ('download_data_api_options_alt_fuel_stations_status', 'All'),
):
    Select(driver.find_element('xpath', '//*[@id="' + _elem_id + '"]')).select_by_visible_text(_choice)
# Fill in the requester details and accept the terms of use.
driver.find_element('xpath', '//*[@id="download_api_user_first_name"]').send_keys('Aditya')
driver.find_element('xpath', '//*[@id="download_api_user_last_name"]').send_keys('Sundar')
driver.find_element('xpath', '//*[@id="download_api_user_email"]').send_keys('aditya.1094@gmail.com')
driver.find_element('xpath', '//*[@id="download_api_user_terms_and_conditions"]').click()
time.sleep(3)  # give the form a moment before submitting
driver.find_element('xpath', '//*[@id="main_content"]/form/input[3]').send_keys(Keys.ENTER)
# Load the downloaded stations file; each fuel type gets its own marker colour.
fuels = pd.read_csv('alt-fuel-stations.csv', dtype='object')
ftype = fuels['Fuel Type Code'].unique()
marker_c = ['black', 'blue', 'red', 'green', 'brown', 'purple', 'orange']
dropdown_ftype = widgets.Dropdown(
    options=ftype,
    value=ftype[0],
    description='Fuel Type:',
    disabled=False,
)
def plot_func(f):
    """Map every station dispensing fuel type *f* on a USA scatter-geo plot.

    Reads the module-level ``fuels`` frame, ``ftype`` array and ``marker_c``
    colour list defined above.
    """
    ff = fuels[fuels['Fuel Type Code'] == f]
    # FIX: wrap with modulo so more fuel types than colours cannot raise
    # IndexError (the original indexed marker_c directly).
    color = marker_c[np.where(ftype == f)[0][0] % len(marker_c)]
    fig = go.Figure(data=go.Scattergeo(
        locationmode='USA-states',
        lon=ff['Longitude'],
        lat=ff['Latitude'],
        text=ff['City'] + ', ' + ff['State'],
        mode='markers',
        hoverinfo='text',
        marker=dict(
            size=4,
            symbol='square',
            color=color)))
    fig.update_layout(
        title='Alternative fuel stations in the USA',
        geo=dict(
            scope='usa',
            projection_type='albers usa',
            showland=True,
            landcolor="rgb(245, 245, 245)",
            subunitcolor="rgb(217, 217, 217)",
            countrycolor="rgb(217, 217, 217)",
            countrywidth=0.5,
            subunitwidth=0.5,
        ), width=700, height=500, margin=dict(t=40, b=0, r=0, l=0)
    )
    fig.show()
widgets.interact(plot_func, f=dropdown_ftype);
Data retrieved from https://ev-database.org/ on 11 August 2022
I was not able to find a database listing vehicle availability in the United States. For preliminary results, I explored a published database listing 245 EVs in Europe. In the future, this kind of analysis will be extended to EVs in the USA.
# Scrape the master EV listing from ev-database.org; the fragment filters force
# all 245 vehicles onto a single page.
url = 'https://ev-database.org/#sort:path~type~order=.rank~number~desc|range-slider-range:prev~next=0~1200|range-slider-acceleration:prev~next=2~23|range-slider-topspeed:prev~next=110~450|range-slider-battery:prev~next=10~200|range-slider-towweight:prev~next=0~2500|range-slider-fastcharge:prev~next=0~1500|paging:currentPage=0|paging:number=all'
reqs = requests.get(url)
soup = BeautifulSoup(reqs.text, 'html.parser')
# One 'data-wrapper' div per vehicle listing.
evcars = soup.find_all('div', class_='data-wrapper')
def car_model(c):
    """Extract a vehicle's display name from its listing tag.

    The name is taken from the fourth '='-separated field of the tag's string
    form (dropping the trailing token); surrounding double quotes are stripped
    from the first and last words.
    """
    words = str(c).split('=')[3].split(' ')[:-1]
    parts = [words[0].strip('""')] + words[1:-1] + [words[-1].strip('""')]
    return ' '.join(parts)
def _span_text(tag):
    """Text between the last '>' preceding the first '</' of the tag's string
    form — the exact extraction pattern the original repeated inline."""
    return str(tag).split('</')[0].split('>')[-1]

def _digits_or_nan(raw):
    """Keep only the digit characters of *raw*; NaN when none remain."""
    digits = re.sub('[^0-9]', '', raw)
    return float(digits) if len(digits) > 0 else np.nan

name, battery, towing, seats, acc, topspeed, Range, efficiency, fastcharge, costeuro, costpound = ([] for _ in range(11))
# BUG FIX: the original looped ``for n in evcars``, clobbering the module-level
# DataFrame ``n`` that the earlier state/year widget callback still reads.
for car in evcars:
    temp = car.find_all('div')
    name.append(car_model(temp[0]))
    battery.append(float(_span_text(temp[2].find_all('span')[0])))
    tow = _span_text(temp[3].find_all('span')[2])
    towing.append(float(tow) if len(tow) > 0 else np.nan)
    seats.append(float(_span_text(temp[3].find_all('span')[-1])))
    perf = temp[4].find_all('span')
    # Performance spans interleave labels and values; odd indices hold values,
    # each formatted as '<number> <unit>'.
    acc.append(float(_span_text(perf[1]).split(' ')[0]))
    topspeed.append(float(_span_text(perf[3]).split(' ')[0]))
    Range.append(float(_span_text(perf[5]).split(' ')[0]))
    efficiency.append(float(_span_text(perf[7]).split(' ')[0]))
    fastcharge.append(_digits_or_nan(_span_text(perf[9]).split(' ')[0]))
    prices = temp[5].find_all('span', class_='price_buy')
    costeuro.append(_digits_or_nan(_span_text(prices[0])))
    costpound.append(_digits_or_nan(_span_text(prices[-1])))
# Assemble the scraped columns into a single DataFrame and export it.
evdata = pd.DataFrame(
    list(zip(name, battery, towing, seats, acc, topspeed, Range, efficiency,
             fastcharge, costeuro, costpound)),
    columns=['Name', 'Battery(kWh)', 'TowCapacity(kg)', 'Seats',
             'Acceleration(0-100)', 'TopSpeed(km/h)', 'Range(km)',
             'Efficiency(km/h)', 'FastCharge(km/h)',
             'CostEuro(' + format(unicodedata.lookup("EURO SIGN")) + ')',
             'CostPound(' + format(unicodedata.lookup("POUND SIGN")) + ')'])
# Manufacturer = first word of the model name.  Vectorized string op replaces
# the original's explicit append loop.
evdata['Make'] = evdata['Name'].str.split(' ').str[0]
evdata.to_json('Europe-EV.json', orient='records')
evdata = pd.read_json('Europe-EV.json', orient='records')
# FIX: restrict to numeric columns explicitly. pandas >= 2.0 raises on
# DataFrame.corr() over the string columns ('Name', 'Make') instead of
# silently dropping them, as older pandas did.
fig = px.imshow(evdata.select_dtypes(include='number').corr().round(3),
                title="Correlations between features in the dataset for EV in Europe")
fig.update_layout(
    title_x=0.5,
    title_font=dict(size=16),
    margin=dict(t=50, b=0, r=0, l=0),
    coloraxis_colorbar_x=0.8,
)
fig.show()
# Histogram of any evdata column, selectable via a plotly dropdown menu.
fig = go.Figure(px.histogram(x=evdata['Make'],
                             text_auto=True,
                             labels={'x': ''},
                             title='Make').update_xaxes(categoryorder='total descending'))
# One dropdown button per column (skipping 'Name'); each swaps the x data and
# the title in place.
x_buttons = [
    dict(method='update',
         label=col,
         args=[{'x': [evdata[col]]}, {'title': col}])
    for col in evdata.columns[1:]
]
fig.update_traces(marker_line_width=1, marker_line_color="white")
fig.update_layout(
    updatemenus=[dict(buttons=x_buttons, direction='down', x=0.98, y=0.99)],
    width=600, height=500,
    title_x=0.5, title_y=0.01,
    title_font=dict(size=16),
    margin=dict(t=50, b=0, r=0, l=0),
    font=dict(size=14),
)
fig.show()
# Axis selectors for the joint-distribution plot below.
dropdown_x = widgets.Dropdown(
    options=evdata.columns[1:],
    value=evdata.columns[1],
    description='Bottom:',
    disabled=False,
)
dropdown_y = widgets.Dropdown(
    options=evdata.columns[1:],
    value=evdata.columns[1],
    description='Left:',
    disabled=False,
)
def plot_func(x, y):
    """Draw a hexbin joint distribution for two evdata feature columns."""
    sns.jointplot(x=evdata[x], y=evdata[y], kind="hex", color="#4CB391")
    plt.xticks(fontsize=16)
    plt.yticks(fontsize=16)
    plt.xlabel(x, fontsize=18)
    plt.ylabel(y, fontsize=18)
    plt.show()
widgets.interact(plot_func, x=dropdown_x, y=dropdown_y);
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import r2_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.model_selection import RepeatedKFold
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor
# Reload the exported dataset and impute missing values with the column means.
evdata = pd.read_json('Europe-EV.json', orient='records')
# FIX: numeric_only=True — pandas >= 2.0 raises TypeError when mean() meets
# the string columns ('Name', 'Make'); older pandas skipped them with a
# warning, so behavior is unchanged there.  fillna with a Series only touches
# matching (numeric) columns either way.
evdata = evdata.fillna(evdata.mean(numeric_only=True))
features = evdata.drop(columns=['CostEuro(€)', 'CostPound(£)', 'Make', 'Name'])
target = evdata['CostPound(£)']
# This cell executes a grid search to determine the best hyperparameters for
# LASSO regression (scaler + Lasso pipeline, 5x5 repeated K-fold, R^2 scoring).
las = Lasso(max_iter=10000, tol=1e-5, alpha=1)
pipe = Pipeline([("scaler", StandardScaler()), ("las", las)])
param_grid = {"las__alpha": np.logspace(-5, 2, 8)}
cv = RepeatedKFold(n_splits=5, n_repeats=5, random_state=11)
search = GridSearchCV(pipe, param_grid, n_jobs=2, return_train_score=True,
                      scoring='r2', cv=cv)
search.fit(features, target)
# print(search.best_params_)
ind = search.best_index_
print('training R2 = ', search.cv_results_['mean_train_score'][ind],
      u'\u00B1', search.cv_results_['std_train_score'][ind])
print('testing R2 = ', search.cv_results_['mean_test_score'][ind],
      u'\u00B1', search.cv_results_['std_test_score'][ind])
training R2 = 0.5913728382279342 ± 0.01889823798741459 testing R2 = 0.5138643421760096 ± 0.11727341431426351
# This cell trains on the entire dataset using the best LASSO hyperparameters.
# BUG FIX: the original passed ``alpha=search.best_index_`` — the integer grid
# index — instead of the winning alpha value.  The model is also refit inside
# the same scaler+Lasso pipeline the grid search evaluated, so the selected
# alpha is applied to standardized features as intended.
las = Pipeline([
    ("scaler", StandardScaler()),
    ("las", Lasso(max_iter=10000, tol=1e-5,
                  alpha=search.best_params_['las__alpha'])),
])
las.fit(features, target)
ypred = las.predict(features)
r2 = np.round(r2_score(target, ypred), 3)
# Predicted-vs-true scatter for the LASSO model with an x=y reference line.
price_min = target.min() - 1000
price_max = target.max() + 1000
fig = px.scatter(x=target, y=ypred,
                 range_x=[price_min, price_max],
                 range_y=[price_min, price_max],
                 width=500, height=500)
diag = np.linspace(price_min, price_max, 1000)
fig.add_scatter(x=diag, y=diag, line_color='black', showlegend=False, line_dash='dash')
fig.update_xaxes(tickvals=list(range(20000, 180000, 20000)), title='True value of CostPound(£)')
fig.update_yaxes(tickvals=list(range(20000, 180000, 20000)), title='Predicted value of CostPound(£)')
fig.update_layout(title='LASSO regression', title_x=0.5)
fig.add_annotation(
    text='R2 = ' + str(r2),
    font=dict(size=12, color='red'),
    showarrow=False,
    xref="paper", yref="paper",
    x=0.8, y=0.97,
)
fig.show()
# This cell executes a grid search to determine the best hyperparameters for
# gradient boosting regression.  Optimisation is more time-consuming; the user
# can move straight to the next cell where optimised parameters are already
# provided for convenience.
t1 = time.time()
gbr = GradientBoostingRegressor(learning_rate=0.01, max_depth=3, n_estimators=100)
pipe = Pipeline([("scaler", StandardScaler()), ("las", gbr)])
param_grid = {
    "las__learning_rate": np.logspace(-5, 0, 6),
    'las__max_depth': range(1, 5),
    'las__n_estimators': range(100, 500, 100),
}
search = GridSearchCV(pipe, param_grid, n_jobs=4, return_train_score=True,
                      scoring='r2',
                      cv=RepeatedKFold(n_splits=5, n_repeats=5, random_state=11))
search.fit(features, target)
print('time =', time.time() - t1)
time = 108.07783341407776
# This cell trains the entire dataset using the best gradient boosting
# hyperparameters.  Since the grid search is more time-consuming, the optimal
# hyperparameters are already provided with the training code.
fig = make_subplots(rows=1, cols=2)
# The following line can be uncommented if the previous cell is executed for
# grid search optimisation.
# las_gbr=GradientBoostingRegressor(learning_rate=search.best_params_['las__learning_rate'],max_depth=search.best_params_['las__max_depth'],n_estimators=search.best_params_['las__n_estimators'])
las_gbr = GradientBoostingRegressor(learning_rate=0.01, max_depth=4, n_estimators=400)
las_gbr.fit(features, target)
ypred = las_gbr.predict(features)
r2 = np.round(r2_score(target, ypred), 3)
price_min = target.min() - 1000
price_max = target.max() + 1000
# Left panel: predicted vs true price with an x=y reference line.
fig.add_scatter(x=np.linspace(price_min, price_max, 1000),
                y=np.linspace(price_min, price_max, 1000),
                line_color='black', showlegend=False, line_dash='dash',
                row=1, col=1)
fig.add_trace(go.Scatter(x=target, y=ypred,
                         mode='markers', showlegend=False,
                         ), row=1, col=1)
fig.update_xaxes(tickvals=list(range(20000, 180000, 20000)),
                 title='True value of CostPound(£)', row=1, col=1)
fig.update_yaxes(tickvals=list(range(20000, 180000, 20000)),
                 title='Predicted value of CostPound(£)', row=1, col=1)
fig.add_annotation(
    text='R2 = ' + str(r2),
    font=dict(size=12, color='red'),
    showarrow=False,
    xref="paper", yref="paper",
    x=0.35, y=0.97,
)
fig.update_layout(title='Regression model & feature importances', title_x=0.5)
# Right panel: horizontal bars of feature importances labelled by column name.
fig.add_trace(go.Bar(x=las_gbr.feature_importances_,
                     y=list(range(features.shape[1])),
                     text=features.columns,
                     orientation='h', showlegend=False), row=1, col=2)
# FIX: derive the number of blank tick labels from the feature count instead
# of the hard-coded 8 the original used.
fig.update_yaxes(tickmode='array', tickvals=list(range(features.shape[1])),
                 ticktext=[''] * features.shape[1], row=1, col=2)
fig.show()